In [None]:
!pip install xgboost



In [None]:
# import library
import cv2
import glob
import csv
import itertools
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score
from google.colab import drive
# Mount Google Drive at /content/drive; the paths used by the later cells all
# live under this mount point. Requires a passcode from the chosen Google account.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# constant
# Root of the shared drive; the other locations are derived from it.
drive_url = "/content/drive/Shared drives/Computer Vision"
dataset_url = drive_url + "/dataset"
# Folder holding one CSV per class (the trailing slash is relied on when the
# file name is appended).
csv_url = drive_url + "/CSV/"
# Number of rows per class used for training, out of the 24 rows per class.
train_dataset_size = 18

In [None]:
# prepare dataset
# One CSV per class; the class label is the position of the name in name_list.
name_list = ["1","2","3","4","5","ก","ง2","จ1","จ2","ฉ1","ซ2","ด","ต","ท1","น","บ","พ","ฟ","ม","ย","ร","ล","ว","ส","ห","อ"]
# Rows taken from each class CSV: the first train_dataset_size go to the
# training set, the rest (up to this count) go to the test set.
samples_per_class = 24
# Column mask applied after the first column is dropped: keeps the first two
# of every three columns, for 21 triples (the third one is the z column).
xy_mask = np.array([True, True, False] * 21)

train_dataset = []
test_dataset = []
train_label = []
test_label = []
for idx, name in enumerate(name_list):
  # read csv and drop its first column
  df = pd.read_csv(csv_url + name + '.csv')
  t_all_data = df.to_numpy()[:, 1:]
  # remove z column
  t_all_data = t_all_data[:, xy_mask]
  # split into train and test
  t_train_dataset = t_all_data[0:train_dataset_size]
  t_test_dataset = t_all_data[train_dataset_size:samples_per_class]
  # create labels sized from the rows actually present in each split, so data
  # and labels stay aligned even if a CSV holds fewer rows than expected
  t_train_label = np.full(len(t_train_dataset), idx)
  t_test_label = np.full(len(t_test_dataset), idx)
  # add temp dataset to list
  train_dataset.append(t_train_dataset)
  test_dataset.append(t_test_dataset)
  train_label.append(t_train_label)
  test_label.append(t_test_label)

# stack the per-class pieces into single arrays
train_dataset = np.concatenate(train_dataset)
test_dataset = np.concatenate(test_dataset)
train_label = np.concatenate(train_label)
test_label = np.concatenate(test_label)
# prepare xgb DMatrix
xgb_train = xgb.DMatrix(train_dataset, label=train_label)
xgb_test = xgb.DMatrix(test_dataset, label=test_label)

In [None]:
# train parameter
# Settings for the tree booster (kept for reference; the training call below
# passes the linear settings).
xgb_params_tree = dict(
    eta=0.05,
    booster="gbtree",
    max_depth=2,
    gamma=0,
    subsample=0.5,
    min_child_weight=1,
    colsample_bytree=0.01,
    objective="multi:softprob",
    eval_metric="merror",
    num_class=26,
)
# Settings for the linear booster.
xgb_params_linear = dict(
    booster="gblinear",
    objective="multi:softprob",
    eval_metric="merror",
    num_class=26,
)
# training
# Both sets are evaluated every round; per the training log, the last entry
# ('xgb_test') is the one used for early stopping.
xgb_model = xgb.train(
    params=xgb_params_linear,
    dtrain=xgb_train,
    num_boost_round=100,
    evals=[(xgb_train, 'xgb_train'), (xgb_test, 'xgb_test')],
    early_stopping_rounds=50,
)

[0]	xgb_train-merror:0.185897	xgb_test-merror:0.262821
Multiple eval metrics have been passed: 'xgb_test-merror' will be used for early stopping.

Will train until xgb_test-merror hasn't improved in 50 rounds.
[1]	xgb_train-merror:0.115385	xgb_test-merror:0.185897
[2]	xgb_train-merror:0.087607	xgb_test-merror:0.134615
[3]	xgb_train-merror:0.070513	xgb_test-merror:0.108974
[4]	xgb_train-merror:0.061966	xgb_test-merror:0.102564
[5]	xgb_train-merror:0.053419	xgb_test-merror:0.096154
[6]	xgb_train-merror:0.049145	xgb_test-merror:0.089744
[7]	xgb_train-merror:0.049145	xgb_test-merror:0.089744
[8]	xgb_train-merror:0.049145	xgb_test-merror:0.083333
[9]	xgb_train-merror:0.038462	xgb_test-merror:0.064103
[10]	xgb_train-merror:0.036325	xgb_test-merror:0.057692
[11]	xgb_train-merror:0.036325	xgb_test-merror:0.057692
[12]	xgb_train-merror:0.036325	xgb_test-merror:0.051282
[13]	xgb_train-merror:0.032051	xgb_test-merror:0.044872
[14]	xgb_train-merror:0.029915	xgb_test-merror:0.038462
[15]	xgb_train-

In [None]:
# predict
# Each prediction row holds one probability per class; the predicted class is
# the column with the highest probability. The score printed is the
# macro-averaged precision.

# train dataset
preds = xgb_model.predict(xgb_train)
best_preds = np.argmax(preds, axis=1)
train_score = precision_score(train_label, best_preds, average='macro')
print("train set score:", train_score)

# test dataset (preds / best_preds are left holding the test-set results)
preds = xgb_model.predict(xgb_test)
best_preds = np.argmax(preds, axis=1)
test_score = precision_score(test_label, best_preds, average='macro')
print("test set score:", test_score)

train set score: 0.9857142857142857
test set score: 0.9565018315018315


In [None]:
# list wrong predict from best_preds
# best_preds is expected to hold the test-set predictions, one per entry of
# test_label. Pairing the two directly avoids assuming a fixed number of test
# rows per class.
if len(best_preds) != len(test_label):
  raise ValueError("best_preds does not match test_label; run the test-set prediction first")
for truth, guess in zip(test_label, best_preds):
  if truth != guess:
    print('answer:',name_list[truth],'predict: ',name_list[guess])

answer: จ1 predict:  ย
answer: จ1 predict:  ย
answer: ด predict:  1
answer: ต predict:  ท1
answer: บ predict:  ฟ
answer: ม predict:  จ2
answer: อ predict:  ส


In [None]:
# params tune
# Candidate values for each setting; every combination is trained once.
_booster = ['gblinear']
_eta = [0.01, 0.05, 0.1]
_max_depth = [2]
_gamma = [0]
_subsample = [0.3, 0.5, 0.7]
_min_child_weight = [1]
_num_boost_round = [100]
# NOTE(review): in the recorded output the scores are identical across the
# subsample values for a given eta -- confirm whether the linear booster
# uses subsample at all.

# loop
for _b, _e, _ma, _g, _s, _mi, _n in itertools.product(
    _booster, _eta, _max_depth, _gamma, _subsample, _min_child_weight, _num_boost_round
):
  # train parameter
  xgb_params = dict(
      eta=_e,
      booster=_b,
      max_depth=_ma,
      gamma=_g,
      subsample=_s,
      min_child_weight=_mi,
      colsample_bytree=0.01,
      objective="multi:softprob",
      eval_metric="merror",
      num_class=26,
  )
  # training (per-round evaluation output is silenced)
  xgb_model = xgb.train(
      params=xgb_params,
      dtrain=xgb_train,
      num_boost_round=_n,
      evals=[(xgb_train, 'xgb_train'), (xgb_test, 'xgb_test')],
      early_stopping_rounds=50,
      verbose_eval=False,
  )
  # predict
  # train dataset: report the configuration together with its train score
  preds = xgb_model.predict(xgb_train)
  best_preds = np.argmax(preds, axis=1)
  print(
      "booster", _b, "eta", _e, "max_depth", _ma, "gamma", _g,
      "subsample", _s, "min_child_weight", _mi, "num_boost_round", _n,
      "\ntrain set score:", precision_score(train_label, best_preds, average='macro'),
  )
  # test dataset
  preds = xgb_model.predict(xgb_test)
  best_preds = np.argmax(preds, axis=1)
  print("test set score:", precision_score(test_label, best_preds, average='macro'))

booster gblinear eta 0.01 max_depth 2 gamma 0 subsample 0.3 min_child_weight 1 num_boost_round 100 
train set score: 0.9217640194817085
test set score: 0.896565934065934
booster gblinear eta 0.01 max_depth 2 gamma 0 subsample 0.5 min_child_weight 1 num_boost_round 100 
train set score: 0.9217640194817085
test set score: 0.896565934065934
booster gblinear eta 0.01 max_depth 2 gamma 0 subsample 0.7 min_child_weight 1 num_boost_round 100 
train set score: 0.9217640194817085
test set score: 0.896565934065934
booster gblinear eta 0.05 max_depth 2 gamma 0 subsample 0.3 min_child_weight 1 num_boost_round 100 
train set score: 0.9615814611944642
test set score: 0.9546703296703297
booster gblinear eta 0.05 max_depth 2 gamma 0 subsample 0.5 min_child_weight 1 num_boost_round 100 
train set score: 0.9615814611944642
test set score: 0.9546703296703297
booster gblinear eta 0.05 max_depth 2 gamma 0 subsample 0.7 min_child_weight 1 num_boost_round 100 
train set score: 0.9615814611944642
test set sco

In [None]:
# save model (disabled)
# xgb_model.save_model(drive_url + '/models/xgb_model_tree.model')
# load model: replace xgb_model with the linear model stored on the drive
xgb_model = xgb.Booster(model_file=drive_url + '/models/xgb_model_linear.model')

In [None]:
# use
# Class names; a prediction index maps to the name at that position.
name_list = ["1","2","3","4","5","ก","ง2","จ1","จ2","ฉ1","ซ2","ด","ต","ท1","น","บ","พ","ฟ","ม","ย","ร","ล","ว","ส","ห","อ"]
print(test_dataset.shape, test_dataset)
# Build an unlabeled DMatrix from the test rows and run the loaded model on it.
xgb_test2 = xgb.DMatrix(test_dataset)
preds = xgb_model.predict(xgb_test2)
# Predicted class per row = column with the highest probability.
best_preds = np.argmax(preds, axis=1)
print(preds)
print(best_preds)

(156, 42) [[0.7238335  1.         0.41014032 ... 0.71479389 0.89639278 0.73377014]
 [0.46485637 1.         0.14966042 ... 0.73136766 0.82685689 0.7684536 ]
 [0.76864963 1.         0.43177988 ... 0.60373442 0.89500268 0.65116277]
 ...
 [0.89486298 1.         0.52402015 ... 0.3539521  0.9556219  0.38818387]
 [0.86056473 1.         0.4758993  ... 0.3613922  0.97062143 0.41939643]
 [0.84320948 1.         0.47039529 ... 0.37663894 0.894317   0.3922176 ]]
[[9.84311283e-01 0.00000000e+00 4.15644472e-12 ... 7.69749661e-17
  5.58959186e-07 1.50432175e-17]
 [9.48601246e-01 0.00000000e+00 7.00464399e-15 ... 3.35175993e-16
  1.07347105e-07 3.65557402e-19]
 [8.10811639e-01 0.00000000e+00 4.36542746e-11 ... 2.12869522e-12
  1.18375624e-06 3.79682552e-14]
 ...
 [2.81742321e-21 4.33554738e-40 4.83074858e-10 ... 2.66825737e-05
  3.17918358e-10 9.99971390e-01]
 [4.07454242e-20 9.31639271e-41 2.89795743e-10 ... 1.00890808e-04
  1.41603881e-10 9.99865174e-01]
 [2.14040718e-18 3.28105441e-35 9.95969884e-09